/*******************************************************************************
Last update: March 16, 2023
********************************************************************************/

cls
clear all
local dir "~\Dropbox\Working Papers\Distinction Effect"
cd "`dir'"

qui do "Does\Data Processing\0. Cleaning Programs.do"

/*******************************************************************************
1. Merge Saber Pro to Spadies: 
********************************************************************************/

/********************
(i) Merge Using IDs
*********************/

//Saber Pro Data
use "Data\Intermediates\SaberPro_2006-2010.dta", clear

keep exam_* national_id* spro_*name* spro_birthdate spro_gender spro_cllgcode

bys national_id* : egen aux = min(exam_time)
keep if exam_time == aux
drop aux

destring national_id, replace force
drop if national_id == . //71 obs
duplicates drop national_id*, force

tempfile SaberPro
save `SaberPro', replace

//SPADIES
use "Data\Originals\SPADIES\Spadies 2016.dta", clear

gen national_id_type = doc_tipo
gen double national_id = doc_num
merge m:1 national_id* using `SaberPro'
keep if _merge == 3 | _merge == 2

//Not Merged IDs
preserve
keep if _merge == 2
keep exam_* national_id* spro_*name* spro_birthdate spro_gender spro_cllgcode
tempfile NotMerged
save `NotMerged', replace
restore

keep if _merge == 3
drop _merge

//First and Last Observed Enrolled in College
bys national_id* : egen spa_fstcllg = min(periodo)
bys national_id* : egen spa_lstcllg = max(periodo)

//First Observed in College Reported at the Exam
gen aux = periodo if spro_cllgcode == ies
bys national_id* : egen spro_fstcllg = min(aux)
bys national_id* : egen spro_lstcllg = max(aux)
drop aux

//Total Courses
bys national_id* : egen spa_classtaken = total(materias_tomadas)
bys national_id* : egen spa_classpassed = total(materias_aprobadas)

//Academic Support
bys national_id* : egen spa_acdmspprt = total(apo_aca)
bys national_id* : egen spa_fincspprt = total(apo_fin)
bys national_id* : egen spa_otherspprt = total(apo_otr)

//ICETEX
bys national_id* : egen spa_licetex = total(ictx_l)
bys national_id* : egen spa_micetex = total(ictx_m)
bys national_id* : egen spa_aicetex = total(ictx_a)

//Other Covariates
rename ies spa_cllgcode
rename sexo spa_gender
rename edu_madre_icfes spa_mothereduc
rename estrato spa_stratum
rename puntaje_icfes spa_sb11score
rename periodo_icfes spa_sb11time
rename grado_per spa_gradtime

//Clean Merge Based on IDs: Using Names, Gender, and Birthday
duplicates drop national*, force

gen spro_name = spro_name1 + " " + spro_name2 + " " + spro_surname1 + " " + spro_surname2
replace spro_name = itrim(trim(spro_name))
gen spa_name = nombre + " " + apellido
replace spa_name = itrim(trim(spa_name))
spell spa_name spro_name, clean(spa_name spro_name)

tostring nac*, replace 
gen spa_birthdate = date(nac_dia+"/"+nac_mes+"/"+nac_ano, "DMY")
format spa_birthdate %td

//Names comparison
matchit spa_name spro_name , sim(ngram,2) generate(phone_score)

//Append Dropped Obs to Not Merged Obs
preserve
cap drop aux
gen aux = 1 if phone_score > 0.7
replace aux = 1 if phone_score > 0.6 & phone_score <= 0.7 /// 
& spro_birthdate == spa_birthdate & spa_birthdate != . ///
& spro_gender == spa_gender & spa_gender != .
drop if aux == 1

keep exam_* national_id* spro_*name* spro_birthdate spro_gender spro_cllgcode
append using `NotMerged'
save `NotMerged', replace
restore

//Clean Obs based on Merge ID
cap drop aux
gen aux = 1 if phone_score > 0.7
replace aux = 1 if phone_score > 0.6 & phone_score <= 0.7 /// 
& spro_birthdate == spa_birthdate & spa_birthdate != . ///
& spro_gender == spa_gender & spa_gender != .
drop if aux == .
drop aux

keep national* doc* spa_*

save "Data\Intermediates\SaberPro_Spadies_id.dta", replace

/************************************************
(ii) Merge Using College ID, Birthday, and Gender
*************************************************/

use `NotMerged', clear

rename spro_gender sexo
rename spro_cllgcode ies
gen nac_dia = day(spro_birthdate)
gen nac_mes = month(spro_birthdate)
gen nac_ano = year(spro_birthdate)

joinby ies nac_* sexo using "Data\Originals\SPADIES\Spadies 2016.dta"

gen double spro_cllgcode = ies

//Clean Merge
duplicates drop national_id national_id_type doc_tipo doc_num, force

cap drop spro_name 
gen spro_name = spro_name1 + " " + spro_name2 + " " + spro_surname1 + " " + spro_surname2
replace spro_name = itrim(trim(spro_name))
gen spa_name = nombre + " " + apellido
replace spa_name = itrim(trim(spa_name))
spell spa_name spro_name, clean(spa_name spro_name)

matchit spa_name spro_name , sim(ngram,2) generate(phone_score)

bys national* : egen double aux = max(phone_score)
keep if phone_score == aux
drop aux

keep if phone_score > 0.85

bys doc_* : egen double aux = max(phone_score)
keep if phone_score == aux
drop aux

duplicates tag doc_*, gen(dup)
keep if dup == 0
drop dup

duplicates tag national*, gen(dup)
drop if national_id != doc_num & dup > 0
drop dup

keep national* doc* spro_cllgcode
merge 1:m doc* using "Data\Originals\SPADIES\Spadies 2016.dta"
keep if _merge == 3
drop _merge

//First and Last Observed Enrolled in College
bys national_id* : egen spa_fstcllg = min(periodo)
bys national_id* : egen spa_lstcllg = max(periodo)

//First Observed in College Reported at the Exam
gen aux = periodo if spro_cllgcode == ies
bys national_id* : egen spro_fstcllg = min(aux)
bys national_id* : egen spro_lstcllg = max(aux)
drop aux

//Total Courses
bys national_id* : egen spa_classtaken = total(materias_tomadas)
bys national_id* : egen spa_classpassed = total(materias_aprobadas)

//Academic Support
bys national_id* : egen spa_acdmspprt = total(apo_aca)
bys national_id* : egen spa_fincspprt = total(apo_fin)
bys national_id* : egen spa_otherspprt = total(apo_otr)

//ICETEX
bys national_id* : egen spa_licetex = total(ictx_l)
bys national_id* : egen spa_micetex = total(ictx_m)
bys national_id* : egen spa_aicetex = total(ictx_a)

//Other Covariates
rename ies spa_cllgcode
rename sexo spa_gender
rename edu_madre_icfes spa_mothereduc
rename estrato spa_stratum
rename puntaje_icfes spa_sb11score
rename periodo_icfes spa_sb11time
rename grado_per spa_gradtime

gen spa_name = nombre + " " + apellido
replace spa_name = itrim(trim(spa_name))
spell spa_name, clean(spa_name)

tostring nac*, replace 
gen spa_birthdate = date(nac_dia+"/"+nac_mes+"/"+nac_ano, "DMY")
format spa_birthdate %td

keep national* doc* spa_*
duplicates drop national*, force

save "Data\Intermediates\SaberPro_Spadies_fuzzy.dta", replace

use "Data\Intermediates\SaberPro_Spadies_id.dta", clear
append using "Data\Intermediates\SaberPro_Spadies_fuzzy.dta"
save "Data\Intermediates\SaberPro_Spadies.dta", replace

/*******************************************************************************
2. Complete Data
********************************************************************************/

use "Data\Intermediates\SaberPro_Spadies.dta", clear

keep national_id_type national_id doc_tipo doc_num

tostring national_id, replace format(%20.0g)
merge 1:m national* using "Data\Intermediates\SaberPro_2006-2010.dta", keepusing(spro_cllgcode exam_time)
keep if _merge == 3
drop _merge

duplicates drop national*, force
duplicates drop doc*, force
merge 1:m doc* using "Data\Originals\SPADIES\Spadies 2016.dta"
keep if _merge == 3
drop _merge

replace exam_time = 20101 if exam_time == 20102
replace exam_time = 20102 if exam_time == 20103

gen aux = materias_tomadas if exam_time <= periodo & spro_cllgcode == ies
bys national* : egen spro_classtaken = total(aux)
drop aux

gen aux = materias_aprobadas if exam_time <= periodo & spro_cllgcode == ies
bys national* : egen spro_classpassed = total(aux)
drop aux

keep national* spro_class*
duplicates drop national*, force

//destring national_id, replace
merge 1:m national* using "Data\Intermediates\SaberPro_Spadies.dta"
keep if _merge == 3
drop _merge

order spro_classtaken spro_classpassed, after(spa_classpassed)
order spa_name spa_birthdate, after(doc_num)
order spa_cllgcode, after(spa_sb11time)

//tostring national_id, replace format(%20.0g)
save "Data\Intermediates\SaberPro_Spadies.dta", replace
